packages <- c("devtools","knitr","tidyverse","widgetframe","readr",
"wordcloud", "base64enc", "tidytext",
"RWeka","stats","manifestoR","readtext",
"rvest", "stringr",
"SnowballC", "plotrix", "tidyr", "tidytext", "stats",
"dendextend", "ggthemes",
"httr","jsonlite", "DT", "textdata", "ggmap","maptools","mapproj","rgeos","rgdal",
"RColorBrewer", "stringr","scales", "leaflet", 'leafpop', "ggthemes", "ggtext", "wordcloud")
packages <- lapply(packages, FUN = function(x) {
if(!require(x, character.only = TRUE)) {
install.packages(x)
library(x, character.only = TRUE)
}
}
)
For this project, we explored reported side effects of the covid-19 vaccine, focusing on adverse reactions reported from 2020-12-01 to 2021-3-31. The visualization aims to provide an insight into who are the people reporting side effects and how do they compare to the general population; the most common reported symptoms etc. We also analyzed tweets associated with the covid-19 vaccine to see people’s attitudes on the vaccine.
vac <- read_csv("covidvacdata/2021VAERSDATA.csv")
sym <- read_csv("covidvacdata/2021VAERSSYMPTOMS.csv")
vax <- read_csv("covidvacdata/2021VAERSVAX.csv")
#convert vaccination date
vac$VAX_DATE <- as.Date(vac$VAX_DATE, format = "%m/%d/%Y")
#merge the first two csv file by patient ID
merge1 <- left_join(vac, sym, by = "VAERS_ID")
#merge the third csv together
merge <- left_join(merge1, vax, by = "VAERS_ID")
#filter for COVID19 vaccine
covid <- merge %>%
filter(VAX_TYPE == "COVID19")
#find out patient's age and gender
agesex <- covid %>%
distinct(VAERS_ID,.keep_all = TRUE) %>% #distinct patients
select(AGE_YRS, SEX, VAX_DATE) %>%
filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U" ) #filter for age over 18, vaccination data after 2020-12-01 and filter out unknown value for sex
as <- agesex %>%
group_by(SEX,AGE_YRS) %>%
count(SEX,AGE_YRS)
ggplot(as, aes(x = AGE_YRS, y = n, color = SEX))+
geom_line(size = 1)+
labs(title = "Covid Vaccine Side Effects Reported Based on Age and Sex<br> By Mar 31, 2021",x="Age",y = "Number of People Reported") +
scale_x_continuous(breaks = seq(20,110,10), limits = c(15, 110)) + scale_y_continuous(breaks = seq(0,600,200), limits = c(0, 600))+
theme_minimal() + theme(plot.title = element_markdown(hjust = 0.5),legend.title = element_blank()) + scale_color_manual(values = c("#fdb863","#2d004b"),labels = c("Female", "Male"))
We could see that in general, women and younger people seem to suffer more from side effects. As age increased, the report number actually decreased, especially for women. Do elders suffer more from side effects? Not exactly. Is it possible there are fewer elders who got vaccinated thus fewer reports? We decided to dive deeper into who got vaccinated by looking at different age groups.
#vaccinated number by age group from cdc
agegroup <- read_csv("covidvacdata/demographic_trends_of_people_receiving_covid19_vaccinations_in_the_united_states.csv")
agegroup$Date <- as.Date(agegroup$Date, format = "%Y/%m/%d")
ageg <- agegroup %>%
filter(Date <= "2021-03-31") %>% #filter for date before 2021/03/31
filter(`Age Group` != "Age_unknown" & `Age Group` != "Age_known" & `Age Group` != "Ages_<18yrs") %>% #filter for age > 18
select(Date, `Age Group`, `People with at least one dose`, `Percent of age group with at least one dose`, Census)
ageg$Age <- with(ageg, ifelse(`Age Group` == "Ages_18-29_yrs", "18-29",
ifelse(`Age Group` == "Ages_30-39_yrs", "30-39",
ifelse(`Age Group` == "Ages_40-49_yrs", "40-49",
ifelse(`Age Group` == "Ages_50-64_yrs", "50-64",
ifelse(`Age Group` == "Ages_65-74_yrs", "65-74",
ifelse(`Age Group` == "Ages_75+_yrs", "75+", "other")))))))
g <- ggplot() + geom_line(data = ageg, aes(x = Date, y =`Percent of age group with at least one dose`, color = Age))+ scale_color_brewer(palette = "PuOr") +scale_y_continuous(breaks = seq(0,100,25), limits = c(0,100))+labs(title = "Percentage of People that Have Received at Least One Dose of Cov Vaccine<br>by Age Group", subtitle = "2020/12/16 - 2021/3/31",x="",y = "") + theme_minimal() + theme(plot.title = element_markdown(hjust=0.5)) + theme(legend.position = "top", legend.title = element_text(size = 8)) + theme(plot.subtitle=element_text(hjust=0.5)) + scale_y_continuous(labels = c("0" = "0", "25" = "25%", "50" = "50%", "75" = "75%", "100" = "100%")) + guides(colour = guide_legend(nrow = 1))
#, labels = c("18-29", "30-39", "40-49", "50-64", "65-74", "75+"
library(plotly)
ggplotly(g) %>% layout(title = paste0("Vaccinated Rate by Different Age Group", "<br>","2020/12/16- 2021/3/31"))
By mar 31, 2021, more than 75% of people over 65 had received at least one dose of covid-19 vaccine; a much higher percentage than younger group.
# report num
yrs <- as %>%
group_by(AGE_YRS) %>%
summarise(sum = sum(n))
yrs$group <- with(yrs, ifelse(AGE_YRS >= 18 & AGE_YRS <= 29, "Ages_18-29_yrs",
ifelse(AGE_YRS >= 30 & AGE_YRS <= 39, "Ages_30-39_yrs",
ifelse(AGE_YRS >= 40 & AGE_YRS <= 49, "Ages_40-49_yrs",
ifelse(AGE_YRS >= 50 & AGE_YRS <= 64, "Ages_50-64_yrs",
ifelse(AGE_YRS >= 65 & AGE_YRS <= 74, "Ages_65-74_yrs",
ifelse(AGE_YRS >= 75, "Ages_75+_yrs", "other")))))))
# report cases by different age group
reportbygroup <- yrs %>%
group_by(group) %>%
summarise(reportnumber = sum(sum))
#join vaccinated cases to calculate report rate
report <- left_join(reportbygroup,ageg, by = c("group" = "Age Group"))
reportrate <- report %>%
filter(Date == "2021-03-31") %>% # vaccinated number until 2021/03/31
select(group, reportnumber,`People with at least one dose`) %>%
mutate(Rate = round(reportnumber*10000 / `People with at least one dose`,2)) #rate
ggplot()+
geom_col(data = reportrate, aes(x = group, y = Rate, fill = group)) + scale_fill_brewer(palette = "PuOr")+
labs(title = "Number of Reported Side Effects Cases per10k Vaccinated People<br> by Mar 31, 2021",x="Age",y = "Number of Reported Cases") +
scale_x_discrete(labels = c("Ages_18-29_yrs" = "18-29", "Ages_30-39_yrs" = "30-39", "Ages_40-49_yrs" = "40-49", "Ages_50-64_yrs" = "50-64", "Ages_65-74_yrs" = "65-74", "Ages_75+_yrs" = "75+")) +
theme_minimal() +
theme(plot.title = element_markdown(hjust = 0.5),legend.position = "none")
By mar31,2021, for people over 75 years old, fewer than 3 of every 10,000 people who received at least one dose of COVID-19, reported adverse effects. In contrast, those aged 30 to 39 reported the highest rate of adverse effects, at 6 out of every 10, 000 people who received the vaccine. So, counter-intuitively, it seems that elders are less vulnerable to side effects. Some articles suggest that the immune response may actually be stronger in the younger group, so side effects may be more pronounced for the younger. The results of our data seem to confirm this. However, there may be other reasons for the lower rate of reported side effects in the elderly group, such as the elderly group is not as likely to use computers as the younger group, which affects the reporting rate, etc
Medical history and Pre-illness are also strong indicators to predict suitable candidates for covid-19 vaccines. So we decided to look at the most common pre-illness of these people who reported adverse reactions.
his <- vac %>%
filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U" ) %>% #filter for age >18 , vaccination data after 2020-12-01 and filter out unknown value for sex
filter(HISTORY != "None" & HISTORY != "NA" & HISTORY !="N/A" & HISTORY != "n/a" & HISTORY !="no" & HISTORY != "none") %>%
select(VAERS_ID,HISTORY) %>%
rename(doc_id = VAERS_ID, text = HISTORY)
#cleaning
new_stops <- c("no", "history","medical", "conditions", "historyconcurrent", "disease","patient", "relevant","chronic","none", stopwords("en"))
his$text <- iconv(his$text, "UTF-8", "UTF-8",sub='')
his$text = removePunctuation(his$text)
his$text=tolower(his$text)
his$text=removeWords(his$text, new_stops)
his$text=removeNumbers(his$text)
require(RWeka)
# Make tokenizer function
tokenizer <- function(x)
NGramTokenizer(x, Weka_control(min = 1, max = 3))
hisd <- data.frame("history" = tokenizer(his$text))
hisd <- hisd %>%
group_by(history) %>%
count(history) %>%
arrange(desc(n))
#write.csv(hisd, "/Users/hannahz/Desktop/data visualization\\history.csv")
#after counting, I manually filtered most common 50 pre-illness. I combind the same illness (hypertension, high blood pressure)
history <- read_csv("covidvacdata/pre-illness_history_clean.csv")
purple_orange <- brewer.pal(10, "RdYlBu")
set.seed(2103)
wordcloud(history$history, history$n,
max.words = 50, colors = purple_orange)
Hypertension, asthma, and diabetes are the most common pre-illness mentioned by people reporting side effects. One thought is that respondents with those diseases might be more vulnerable to vaccine side effects. However, Is it possible that those symptoms reported are actually caused by pre-illness but rather than vaccines? People might associate health problems that would have happened anyway with the vaccines. We haven’t come up with a more accurate way to describe this relationship but it is worth exploring.
nums <- vac %>%
filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U" & NUMDAYS != "NA") %>%
select(SEX,AGE_YRS,NUMDAYS)
nums$group <- with(nums, ifelse(AGE_YRS >= 18 & AGE_YRS <= 29, "18-29",
ifelse(AGE_YRS >= 30 & AGE_YRS <= 39, "30-39",
ifelse(AGE_YRS >= 40 & AGE_YRS <= 49, "40-49",
ifelse(AGE_YRS >= 50 & AGE_YRS <= 64, "50-64",
ifelse(AGE_YRS >= 65 & AGE_YRS <= 74, "65-74",
ifelse(AGE_YRS >= 75, "75+", "other")))))))
numbyages <- nums %>%
group_by(SEX,group) %>%
summarise(mean = mean(NUMDAYS))
ggplot(numbyages, aes(x = group, y = mean, fill = SEX)) + geom_col(position = "dodge", width = 0.6) + labs(title = "Side Effects Average Onset Days After Vaccination", x ="Age", y = "Number of Days after Vaccination") +
theme_minimal() + scale_y_continuous(breaks = seq(1,6,1), limits = c(0,6)) + theme(plot.title = element_text(hjust = 0.5), legend.title = element_blank())+scale_fill_manual(values = c("#fdb863","#2d004b"),labels = c("Female", "Male"))
In general, the younger group seems to be more sensitive about side effects; they reported symptoms under 3 days after vaccination. Elders tend to be slower in feeling the symptoms. Side effects tend to kick in early for males in the younger group. In the elder group, on the contrary, females seem to suffer earlier from the symptoms.
#select symptoms out
sympword <- covid %>%
select(SYMPTOM1, SYMPTOM2, SYMPTOM3, SYMPTOM4, SYMPTOM5)
# list all the symptoms out in a column
symd1 <- data.frame(symptom=unlist(sympword, use.names = FALSE))
#filter out na value
symd1 <- na.omit(symd1)
common10 <- symd1 %>%
group_by(symptom) %>%
count(symptom) %>%
arrange(desc(n)) %>%
ungroup() %>%
mutate(rank = row_number()) %>%
filter(rank <= 10)
ggplot(common10) + geom_col(aes(x = n, y = reorder(symptom, n)), fill ="#fdd49e", width = 0.7) + scale_x_continuous(name = "Number of Symptom Reported", breaks = seq(0,8000,1000)) + labs(title = "Top10 Side Effect Symptoms", y = "")+theme_minimal() + theme_minimal() + theme(plot.title = element_text(hjust = 0.5))
senti <- vac %>%
filter(AGE_YRS >= 18 & AGE_YRS != 'NA', VAX_DATE >= "2020-12-01" & SEX != "U") %>%
select(VAERS_ID,SYMPTOM_TEXT) %>%
rename(doc_id = VAERS_ID, text = SYMPTOM_TEXT)
senti$text <- iconv(senti$text, "UTF-8", "UTF-8",sub='')
#convert to a df corpus
df_source <- DataframeSource(senti)
df_corpus <- VCorpus(df_source)
#filter out neutral words
new_stops <- c("can", "make", "patient", "vaccine", "received","included","medical","experienced", "outcome", "feeling","receiving", "time", "female", "information", "immunization","shot", "nurse","doctor", "action" ," including","noted","shoulder","physician","visit","reporter", "ethics", "resident",stopwords("en"))
#clean corpus
clean_corpus <- function(corpus){
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(new_stops))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
return(corpus)
}
cleancor <- clean_corpus(df_corpus)
top_dtm <- DocumentTermMatrix(cleancor)
# convert to tidytext
top_td <- tidy(top_dtm)
#using NRC
# I had some difficulty connecting to the tidytext package and downloading the NRC(probabily because I'm using a vpn) , so I downloaded manually
txt <- read.table("covidvacdata/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt")
txt <- txt %>%
filter(V3 != 0) %>%
select(V1, V2)
nrc <- dplyr::rename(txt, word = V1, sentiment = V2)
emotion <- top_td %>%
inner_join(nrc, by = c(term = "word")) %>%
group_by(sentiment) %>%
count(term) %>%
arrange(desc(n)) %>%
mutate(rank = row_number()) %>%
mutate(number = n/1000) %>%
filter(rank <= 10)
ggplot(data = emotion, aes(reorder(term, number), number, fill=sentiment)) +geom_bar(stat="identity", show.legend = FALSE) +scale_y_continuous(breaks = seq(0,6,2), labels = c("0" = "0", "2" = "2k", "4" = "4k", "6" = "6k"))+facet_wrap(~sentiment, scales="free_y", ncol=5) +labs(y = "Number of Times Expressed", x = NULL, title = "Emotion Words Expressed in Symptoms Description") +coord_flip() + scale_fill_brewer(palette = "Paired") + theme(plot.title = element_text(hjust = 0.5))
Negative, disgust, fear, sadness are the most frequently expressed emotions in the symptoms text, which is reasonable because people were experiencing uncomfortable feelings in their bodies.